//	Roast+ License

//	SIMD

#ifndef __SFJP_OPENMGL_roast_simd_core_HPP__
#define __SFJP_OPENMGL_roast_simd_core_HPP__

#include <stdio.h>
#include <memory.h>

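#include <nmmintrin.h>   // Include when using the instruction sets from MMX up to SSE4.2 //
#include <smmintrin.h>   // Include when using the instruction sets from MMX up to SSE4.1 //
#include <intrin.h>      // Include when using the instruction sets from MMX up to SSE3   //
#include <emmintrin.h>   // Include when using the instruction sets from MMX up to SSE2   //
#include <xmmintrin.h>   // Include when using the instruction sets from MMX up to SSE    //
#include <mmintrin.h>    // Include when using the MMX instruction set                    //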

// Optional configuration switch, currently disabled. Nothing in the visible
// part of this header tests it.
//#define _ROAST_SIMD__ENABLE_FAST_INLINE

// Declaration prefix that requests 16-byte alignment for the declared object.
// Uses the Microsoft-specific __declspec(align(N)) syntax.
#define _ROAST_SIMD__ALIGN_16    __declspec(align(16))

// Expands to one Microsoft-style inline-assembly statement:
//     __asm movaps xmm<XMM_NUM>, xmmword ptr [AL16FARY]
// XMM_NUM is token-pasted onto "xmm", so it must be a literal register number.
// NOTE(review): in MSVC inline assembly a C/C++ variable name inside brackets
// appears to denote the variable's own storage. If AL16FARY is a pointer
// variable, this would read the bytes of the pointer rather than the memory
// it points to -- confirm against the MSVC inline assembler documentation.
#define _ROAST_SIMD__MOVAPS(XMM_NUM,AL16FARY)    __asm movaps xmm ## XMM_NUM, xmmword ptr [AL16FARY]

// Disabled experiments: helper macros for generating an "if (xmm_num == N)"
// chain through a preprocessor repeat facility.
//#define _AAA(N,_SIMD,_VAL)	if( xmm_num == N ){ _SIMD(N, _VAL); }
/*#define _AAA2(N,_VAL,_VAL2)	if( xmm_num == N ){ _VAL; _VAL2; }
#define _AAA3(N,_VAL,_VAL2,_VAL3)	if( xmm_num == N ){ _VAL; _VAL2; _VAL3; }
#define _AAA4(N,_VAL,_VAL2,_VAL3,_VAL4)	if( xmm_num == N ){ _VAL; _VAL2; _VAL3; _VAL4; }
#define _AAA5(N,_VAL,_VAL2,_VAL3,_VAL4,_VAL5)	if( xmm_num == N ){ _VAL; _VAL2; _VAL3; _VAL4; _VAL5; }*/

namespace roast{

	namespace simd
	{
		namespace sse
		{
			/*	4floatzXMMxixxmm_numjɊi[	*/
			//inline static void movaps(int xmm_num, _ROAST_SIMD__ALIGN_16 float f_array_4[4])
			inline void movaps(int xmm_num, float *f_array_4)
			{
				//ROAST_PP_REPEART_INC_PARAM2(10,_AAA,_ROAST_SIMD__MOVAPS,f_array_4)
				/*ROAST_PP_REPEART_INC_PARAM2(100,_AAA2,f_array_4,f_array_4)
				ROAST_PP_REPEART_INC_PARAM3(100,_AAA3,f_array_4,f_array_4,f_array_4)
				ROAST_PP_REPEART_INC_PARAM4(100,_AAA4,f_array_4,f_array_4,f_array_4,f_array_4)
				ROAST_PP_REPEART_INC_PARAM5(110,_AAA5,f_array_4,f_array_4,f_array_4,f_array_4,f_array_4)*/

				
				switch( xmm_num )
				{
				case 1: _ROAST_SIMD__MOVAPS(1, f_array_4);
					break;
				case 2: _ROAST_SIMD__MOVAPS(2, f_array_4);
					break;
				case 3: _ROAST_SIMD__MOVAPS(3, f_array_4);
					break;
				case 4: _ROAST_SIMD__MOVAPS(4, f_array_4);
					break;
				}
			}

		}//	end of SSE namespace




		/**
		 * Builder-style helper for issuing SIMD loads.
		 *
		 * _SIMD_Ty selects the instruction-set family. It is compared against
		 * the ROAST_PARALLELABLE_TYPE_SIMD_* constants, which are defined
		 * outside this header.
		 *
		 * The register-state table and the "left" link are initialised by the
		 * constructor but are not otherwise used by the code in this class.
		 */
		template <int _SIMD_Ty>
		class simd_lambda
		{
		protected:
			//	One slot per XMM register; holds one of the _XMM_STATE_* values.
			int m_xmm_state[16];

			enum {
				_XMM_STATE_EMPTY = 0,
				_XMM_STATE_INT = 1,
				_XMM_STATE_FLOAT = 2,
				_XMM_STATE_DOUBLE = 3
			};

			//	Link to another simd_lambda; NULL after construction.
			simd_lambda* m_p_simd_lamda_left;

			/*int select_empty_xmm(){
				if ( m_xmm_state[0] == 
				//for(int i=0; i<
			}*/

		public:
			//	Constructor/Destructor

			/**	Marks every register slot as _XMM_STATE_EMPTY (0) and clears the link. */
			simd_lambda(){
				::memset(m_xmm_state,0x00,sizeof(m_xmm_state));
				m_p_simd_lamda_left = NULL;
			}
			virtual ~simd_lambda(){}

			/////////////////////////////////////////////

			/**
			 * Loads floats into consecutive XMM registers, four floats per register.
			 *
			 * float_count is expected to be a multiple of 4. At most four
			 * registers (16 floats) are loaded; a trailing group of fewer than
			 * four floats and anything past the 16th float are ignored.
			 * Nothing is loaded unless _SIMD_Ty lies in the SSE range.
			 *
			 * @param f_array_4    first float to load; handed to sse::movaps
			 *                     in steps of four elements.
			 * @param float_count  number of floats available at f_array_4.
			 * @param xmm_start    index of the first destination register;
			 *                     following groups use xmm_start+1, +2, +3.
			 * @return *this, so that calls can be chained.
			 */
			simd_lambda& load_floats(float *f_array_4, unsigned int float_count, int xmm_start=1)
			{
				//	SSE family only.
				if ( _SIMD_Ty >= ROAST_PARALLELABLE_TYPE_SIMD_SSE &&
					 _SIMD_Ty <= ROAST_PARALLELABLE_TYPE_SIMD_SSE_END )
				{
					if ( float_count >= 4 )
						sse::movaps(xmm_start+0, f_array_4);
					if ( float_count >= 8 )
						sse::movaps(xmm_start+1, f_array_4+4);
					if ( float_count >= 12 )
						sse::movaps(xmm_start+2, f_array_4+8);
					if ( float_count >= 16 )
						sse::movaps(xmm_start+3, f_array_4+12);
					/*
					for(unsigned int i=0; i<float_count/4; i++)
						sse::movaps(i+1, f_array_4);
					*/
				}

				return *this;
			}

			/**
			 * Comma operator for chaining simd_lambda expressions.
			 *
			 * The right-hand operand is not used yet; the left-hand object is
			 * returned. The function must return explicitly: flowing off the
			 * end of a function with a non-void return type is undefined
			 * behaviour.
			 *
			 * @param head  right-hand operand (currently ignored).
			 * @return *this.
			 */
			simd_lambda& operator ,(simd_lambda& head)
			{
				(void)head;	//	silence the unused-parameter warning
				return *this;
			}
		};
	}
}

#endif//__SFJP_OPENMGL_roast_simd_core_HPP__
